#imports libaries
import pandas as pd
import numpy as np
import os
import datetime as dt
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
#get current working directory
cwd = os.getcwd()
#create a dataframe from csv file
df = pd.read_csv("airways customer data.csv")
df.head()
| Unnamed: 0 | reviews | rates | date | country | |
|---|---|---|---|---|---|
| 0 | 0 | ✅ Trip Verified | Excellent service both on th... | \n\t\t\t\t\t\t\t\t\t\t\t\t\t5 | 24th January 2023 | United Kingdom |
| 1 | 1 | ✅ Trip Verified | Good lounge at Cape Town. O... | 9 | 24th January 2023 | United Kingdom |
| 2 | 2 | ✅ Trip Verified | A really excellent journey.... | 9 | 23rd January 2023 | United Kingdom |
| 3 | 3 | ✅ Trip Verified | This flight was one of the ... | 9 | 21st January 2023 | United Kingdom |
| 4 | 4 | Not Verified | It seems that there is a race t... | 2 | 19th January 2023 | United States |
df['verified'] = df.reviews.str.contains("Trip Verified")
df['verified']
0 True
1 True
2 True
3 True
4 False
...
3413 False
3414 False
3415 False
3416 False
3417 False
Name: verified, Length: 3418, dtype: bool
df.shape
(3418, 6)
df.columns
Index(['Unnamed: 0', 'reviews', 'rates', 'date', 'country', 'verified'], dtype='object')
df.describe()
| Unnamed: 0 | |
|---|---|
| count | 3418.000000 |
| mean | 1708.500000 |
| std | 986.835937 |
| min | 0.000000 |
| 25% | 854.250000 |
| 50% | 1708.500000 |
| 75% | 2562.750000 |
| max | 3417.000000 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3418 entries, 0 to 3417 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 3418 non-null int64 1 reviews 3418 non-null object 2 rates 3418 non-null object 3 date 3418 non-null object 4 country 3416 non-null object 5 verified 3418 non-null bool dtypes: bool(1), int64(1), object(4) memory usage: 137.0+ KB
#Cleaning/Format date
df.dtypes
Unnamed: 0 int64 reviews object rates object date object country object verified bool dtype: object
df.date = pd.to_datetime(df.date)
df.date.head()
0 2023-01-24 1 2023-01-24 2 2023-01-23 3 2023-01-21 4 2023-01-19 Name: date, dtype: datetime64[ns]
#Cleaning ratings with stars
#check for unique values
df.rates.unique()
array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '9', '2', '3', '1', '4', '5', '8',
'6', '7', '10', 'None'], dtype=object)
# remove the \t and \n from the ratings
df.rates = df.rates.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")
df.rates.value_counts()
1 745 2 385 3 380 8 344 10 303 7 297 9 293 5 255 4 232 6 179 None 5 Name: rates, dtype: int64
#There are 5 rows having values "None" in the ratings. We will drop all these 5 rows.
# drop the rows where the value of ratings is None
df.drop(df[df.rates == "None"].index, axis=0, inplace=True)
#check the unique values again
df.rates.unique()
array(['5', '9', '2', '3', '1', '4', '8', '6', '7', '10'], dtype=object)
#Check for null Values
df.isnull().value_counts()
Unnamed: 0 reviews rates date country verified
False False False False False False 3411
True False 2
dtype: int64
df.country.isnull().value_counts()
False 3411 True 2 Name: country, dtype: int64
#We have two missing values for country. For this we can just remove those two reviews (rows) from the dataframe
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)
df.shape
(3411, 6)
df.isnull().value_counts()
Unnamed: 0 reviews rates date country verified False False False False False False 3411 dtype: int64
#resetting the index
df.reset_index(drop=True)
| Unnamed: 0 | reviews | rates | date | country | verified | |
|---|---|---|---|---|---|---|
| 0 | 0 | ✅ Trip Verified | Excellent service both on th... | 5 | 2023-01-24 | United Kingdom | True |
| 1 | 1 | ✅ Trip Verified | Good lounge at Cape Town. O... | 9 | 2023-01-24 | United Kingdom | True |
| 2 | 2 | ✅ Trip Verified | A really excellent journey.... | 9 | 2023-01-23 | United Kingdom | True |
| 3 | 3 | ✅ Trip Verified | This flight was one of the ... | 9 | 2023-01-21 | United Kingdom | True |
| 4 | 4 | Not Verified | It seems that there is a race t... | 2 | 2023-01-19 | United States | False |
| ... | ... | ... | ... | ... | ... | ... |
| 3406 | 3413 | Flew British Airways from Sydney to Venice on ... | 6 | 2014-05-20 | Australia | False |
| 3407 | 3414 | Recently for our honeymoon we took advantage o... | 10 | 2014-05-19 | United Kingdom | False |
| 3408 | 3415 | LGW-Antigua on 29 April and Antigua-LGW on 13 ... | 1 | 2014-05-18 | United Kingdom | False |
| 3409 | 3416 | LHR-JTR-LHR. A tale of opposites for BA's new ... | 4 | 2014-05-18 | United Kingdom | False |
| 3410 | 3417 | Gatwick-Venice return. Two contrasting experie... | 2 | 2014-05-18 | United Kingdom | False |
3411 rows × 6 columns
#What is the average overall rating given for British Airways?
df.rates.mean()
inf
df.rates.value_counts().plot(kind="bar")
plt.xlabel("Ratings")
plt.ylabel("Total Number of reviews with that rating")
plt.suptitle("Counts for each ratings")
Text(0.5, 0.98, 'Counts for each ratings')
df_ratings = pd.DataFrame(df.rates.value_counts())
pct_values = (df_ratings.rates.values/ df_ratings.rates.values.sum() *100).tolist()
pct_values = [round(x,2) for x in pct_values]
df_ratings['pct_values'] = pct_values
# renaming columns
df_ratings.rename(columns={'index':'rates', 'rates':'total_counts'}, inplace=True)
df_ratings
| total_counts | pct_values | |
|---|---|---|
| 1 | 745 | 21.84 |
| 2 | 385 | 11.29 |
| 3 | 380 | 11.14 |
| 8 | 343 | 10.06 |
| 10 | 302 | 8.85 |
| 7 | 297 | 8.71 |
| 9 | 293 | 8.59 |
| 5 | 255 | 7.48 |
| 4 | 232 | 6.80 |
| 6 | 179 | 5.25 |
# Unique countries BA recieved the reviews from
print(f"{len(df.country.unique())} unique countries")
69 unique countries
#Which country most review comes from?
df_country_review = pd.DataFrame(df.country.value_counts().head()).reset_index()
df_country_review.rename(columns={'index':'country','country':'total_reviews'}, inplace=True)
df_country_review.plot(kind="bar", x='country')
plt.title("Maximum number of review by country")
Text(0.5, 1.0, 'Maximum number of review by country')
#convert the date datatype to datetime
df.date = pd.to_datetime(df.date)
fig = px.line(df, x='date', y="rates")
fig.update_xaxes(rangeslider_visible=True)
fig.show()